Introduction

Understanding R Grammar

== vs %in%

x <- c('a', 'b', 'c')
y <- c('c', 'b', 'a')

# equality operator: compares x and y element by element (position matters)
x == y
## [1] FALSE  TRUE FALSE
search()
## [1] ".GlobalEnv"        "package:stats"     "package:graphics" 
## [4] "package:grDevices" "package:utils"     "package:datasets" 
## [7] "package:methods"   "Autoloads"         "package:base"
# value matching: is each element of x found anywhere in y? (position does not matter)
x %in% y
## [1] TRUE TRUE TRUE
x %in% letters
## [1] TRUE TRUE TRUE

Other important functions/operators to know

df$variable (specify dataframe and variable within the dataframe)

%>% (pipe data into functions)

Example: (take Iris data, subset, and then aggregate)

iris %>% 
subset (Sepal.Length < 5) %>% 
aggregate(. ~ Species, ., mean) 

package:: (specify package before function)

lapply (apply a function to each element of a list or vector using the format: lapply(object, function))

library(tidyverse)
data <- data.frame(x, y)
as_tibble(data)

Brackets

# [] subsets a data frame by position: data[row, column]
## [, x] = column x (all rows); a single column of a data.frame comes back as a vector
new_data <- data[, 2]
as_tibble(new_data)
## [x, ] = row x (all columns)
new_data2 <- data[2, ]
# as_tibble() replaces as.tibble(), which was deprecated in tibble 2.0.0
as_tibble(new_data2)

Installing Packages

## Install Packages from CRAN
install.packages("dplyr")


## Install Package from GitHub
install.packages("devtools")
devtools::install_github("DeveloperName/PackageName")
devtools::install_github("RandiLGarcia/dyadr")

Loading Packages

# Load Packages
library(dplyr)

# One Way to Load Multiple Packages
pkgs <- c("psych","tidyr","tidyverse","dplyr","haven","lm.beta","car","Hmisc","skimr","janitor", "labelled", "expss", "foreign")
lapply(pkgs, library, character.only = TRUE)

Checking Packages

search()
##  [1] ".GlobalEnv"        "package:foreign"   "package:expss"    
##  [4] "package:labelled"  "package:janitor"   "package:skimr"    
##  [7] "package:Hmisc"     "package:Formula"   "package:survival" 
## [10] "package:lattice"   "package:car"       "package:carData"  
## [13] "package:lm.beta"   "package:haven"     "package:psych"    
## [16] "package:forcats"   "package:stringr"   "package:dplyr"    
## [19] "package:purrr"     "package:readr"     "package:tidyr"    
## [22] "package:tibble"    "package:ggplot2"   "package:tidyverse"
## [25] "package:stats"     "package:graphics"  "package:grDevices"
## [28] "package:utils"     "package:datasets"  "package:methods"  
## [31] "Autoloads"         "package:base"

Functions

?mean
?dplyr::mutate

Importing Datasets

## CSV
# Saved in the same folder
basic_df <- read.csv("depression_example_data.csv", stringsAsFactors = FALSE) # character strings will not be converted to factors
tibble_df <- read_csv("depression_example_data.csv") # reads as tibble

# Saved in different places
# Option 1 - Set working directory
getwd()
setwd("/Users/kareenadelrosario/Desktop/Local R Code/NewFolder")
read_csv("csvFileName.csv")

# Option 2 - Include file path
read_csv("/Users/kareenadelrosario/Desktop/Local R Code/NewFolder/csvFileName.csv")

# Option 3 - Choose file
read.csv(file.choose(), header = TRUE)

read_sav(file.choose())
read_sas(file.choose())

# Option 4 - Use Menu
# file -> Import Dataset

Data Manipulation

Intro to Dplyr

library(gapminder)

Examine Data

colnames(gapminder)
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
skim(gapminder)
Data summary
Name gapminder
Number of rows 1704
Number of columns 6
_______________________
Column type frequency:
factor 2
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
country 0 1 FALSE 142 Afg: 12, Alb: 12, Alg: 12, Ang: 12
continent 0 1 FALSE 5 Afr: 624, Asi: 396, Eur: 360, Ame: 300

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 1979.50 17.27 1952.00 1965.75 1979.50 1993.25 2007.0 ▇▅▅▅▇
lifeExp 0 1 59.47 12.92 23.60 48.20 60.71 70.85 82.6 ▁▆▇▇▇
pop 0 1 29601212.32 106157896.74 60011.00 2793664.00 7023595.50 19585221.75 1318683096.0 ▇▁▁▁▁
gdpPercap 0 1 7215.33 9857.45 241.17 1202.06 3531.85 9325.46 113523.1 ▇▁▁▁▁
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Variable Class

Before you run any analyses, you’ll need to make sure the variable class is correct (e.g. factor vs numeric vs character).

# View class of each variable
lapply(gapminder, class)
## $country
## [1] "factor"
## 
## $continent
## [1] "factor"
## 
## $year
## [1] "integer"
## 
## $lifeExp
## [1] "numeric"
## 
## $pop
## [1] "integer"
## 
## $gdpPercap
## [1] "numeric"
# Factor = Nominal in SPSS

Intro to Dplyr

Select, Filter, Mutate, Arrange, Transmute

Select allows us to easily subset our data by only selecting the desired variables.

# new_df <-
gapminder %>%
  select (country, lifeExp)

Filter, just like the filter function in SPSS, allows us to work with the cases that meet certain criteria

DATASET COPY filtered_df.
DATASET ACTIVATE filtered_df.
FILTER OFF.
USE ALL.
SELECT IF ((continent = 'Africa') & (year > 1990)).
EXECUTE.
# filtered_df <-
gapminder %>%
  filter (continent == "Africa", year > 1990) %>%
  head()
# Saves in global environment

Mutate creates new variables or changes existing variables. The SPSS equivalent is COMPUTE VARIABLE.

gapminder %>%
  mutate(log.gdp = log(gdpPercap)) %>%
  head()
COMPUTE diff.gdp=gdpPercap - mean(gdpPercap).
EXECUTE.
gapminder %>%
  mutate(diff.gdp = gdpPercap - mean(gdpPercap, na.rm = TRUE)) %>%
  head()

Summarize (or summarise) calculates a single value per group. Useful for aggregating data.

gapminder %>%
  group_by(year) %>%
  dplyr::summarize(mean_gdp = mean(gdpPercap)) %>%
  arrange(desc(year))

Using a real dataset

emp_df <- read_csv("/Users/kareenadelrosario/Desktop/Local R Code/Empathy_Background_randomized.csv")

Examine Data

colnames(emp_df)
##  [1] "ID"        "Dyad"      "iri1"      "iri2"      "iri3"      "iri4"     
##  [7] "iri5"      "iri6"      "iri7"      "iri8"      "iri9"      "iri10"    
## [13] "iri11"     "iri12"     "iri13"     "iri14"     "iri15"     "iri16"    
## [19] "iri17"     "iri18"     "iri19"     "iri20"     "iri21"     "iri22"    
## [25] "iri23"     "iri24"     "iri25"     "iri26"     "iri27"     "iri28"    
## [31] "Condition"

Change Variable Class

#### Option 1
# Change ID, Dyad, and Condition to factor
emp_df$ID <- as.factor(emp_df$ID)
class(emp_df$ID)
## [1] "factor"
#### Option 2
## Useful when converting multiple variables at once. Note: df[cols] selects columns
# Convert categorical variables to factor. The columns are selected by name
# (they are columns 1, 2 and 31 in the colnames() output) so the code keeps
# working if the column order changes, and the helper vector is not called
# `names`, which would mask base::names().
factor_cols <- c("ID", "Dyad", "Condition")
emp_df[factor_cols] <- lapply(emp_df[factor_cols], factor)
head(lapply(emp_df, class))
## $ID
## [1] "factor"
## 
## $Dyad
## [1] "factor"
## 
## $iri1
## [1] "numeric"
## 
## $iri2
## [1] "numeric"
## 
## $iri3
## [1] "numeric"
## 
## $iri4
## [1] "numeric"
#### Option 3
## Use dplyr to change class and apply value labels
# Make sure ID, Dyad, and Condition are classified as factors, and label Condition
emp_df <- 
  emp_df %>%
  mutate(ID = factor(ID),
         Dyad = factor(Dyad),
         # Change from 0,1 to Control and Sad. The levels are given explicitly so
         # each label is tied to its code (0 = Control, 1 = Sad) instead of
         # depending on the sort order of the values.
         Condition = factor(Condition, levels = c(0, 1), labels = c("Control", "Sad")))

emp_df %>% 
  select(ID, Dyad, Condition) %>%
  head()

Remove Variables (Columns) and Cases (Rows)

Variables

# Delete variables
emp_df %>%
  select(-c(iri1)) %>%
  head()

Cases

# Exclude participants by dyad ID
emp_df <- emp_df[ !(emp_df$Dyad %in% c(121, 124, 158, 168, 153)), ]

# Or by row number
emp_df %>%
  slice(-c(58:59, 117:118, 137:138, 107:108))

Aggregate Data

emp_df %>%
  group_by(Dyad) %>%
  dplyr::summarize(mean_iri1 = mean(iri1))

Composite Scores

#### Option 1
reverse <- emp_df %>%
  mutate(iri3r = 6 - iri3,
         iri4r = 6 - iri4,
         iri7r = 6 - iri7,
         iri12r = 6 - iri12,
         iri13r = 6 - iri13,
         iri14r = 6 - iri14,
         iri15r = 6 - iri15,
         iri18r = 6 - iri18,
         iri19r = 6 - iri19)

#### Option 2
# rowwise() tells R to compute across each ROW instead of down each column
# we can also reverse code in one step! (write 6 - item in place of the item)
emp_df <- emp_df %>%
  rowwise() %>%
  mutate(emp_sum = sum(c(iri1, iri2, 6 - iri3, 6 - iri4, iri5, iri6, 6 - iri7,
                         iri8, iri9, iri10, iri11, 6 - iri12, 6 - iri13,
                         6 - iri14, 6 - iri15, iri16, iri17, 6 - iri18,
                         6 - iri19, iri20, iri21, iri22, iri23, iri24, iri25,
                         iri26, iri27, iri28))) %>%
  ungroup() # drop the rowwise grouping so later steps work on whole columns again

Conditional mutate (ifelse)

In SPSS: COMPUTE VARIABLE (IF)

# This dataset does not have a "partner" variable, which is super important for dyadic data. 
## Use mutate to create a "partner" variable.
emp_df <- emp_df %>%
  mutate(ID.n = as.numeric(as.character(ID)),
         Dyad.n = as.numeric(as.character(Dyad)),
         partner = ID.n - (Dyad.n * 10)) %>%
  mutate(partner = factor(partner))

# Preview variables
emp_df %>%
  select(Dyad, ID, partner)
## If we wanted to recode an existing variable, we could use this function: emp_df$Condition[emp_df$Condition == 1] <- 'Sad'

## We'll need to create a new condition variable to distinguish each individual condition (sad actors vs sad partners vs control dyad)
emp_df <- emp_df %>%
  mutate(p_cond = ifelse( (Condition %in% "Sad") & (partner %in% 1), 2,
                          ifelse( (Condition %in% "Sad") & (partner %in% 2), 1, 3))) %>%
  mutate(p_cond = factor(p_cond))

#### Sad Partner = 1
#### Sad Actor = 2
#### Control Dyad = 3

## Double-check new variable
emp_df %>%
  select(ID, Condition, partner, p_cond)
# Remove rows with a missing emp_sum or Condition, then keep only dyads that still have both members
emp_df <- emp_df %>%
  drop_na(emp_sum, Condition) %>%
  group_by (Dyad) %>%
  filter(n() == 2) %>%
  ungroup()

Restructuring Dataset

Wide vs Long Format

Wide to Long

df_wide <- read_csv("wide_df.csv")

Pivot_longer() from tidyr

#### Option 1
df_long <- pivot_longer(df_wide,
                        cols = !ID, # variables that should be left alone
                        names_to = "Year", # header of wide_df
                        values_to = "Value") # values that correspond to variable names
head(df_long)
#### Option 2
df_wide %>%
 pivot_longer(
   cols = starts_with("199"), # could specify which columns to pivot
   names_to = "Year",
   values_to = "Value",
   values_drop_na = TRUE # exclude NAs
 )

Long to wide

Pivot_wider()

df_wide2 <- pivot_wider(df_long,
                        names_from = Year,
                        values_from = Value,
                        values_fill = 999) # fill NA with 999

head(df_wide2)
### Is it identical to our other wide df?
setequal(df_wide, df_wide2)
## New names:
## * `1990` -> ...1990
## * `1992` -> ...1992
## * `1994` -> ...1994
## * `1996` -> ...1996
## New names:
## * `1990` -> ...1990
## * `1992` -> ...1992
## * `1994` -> ...1994
## * `1996` -> ...1996
## [1] TRUE

Introduction to Writing Functions

# Print the arithmetic mean of a numeric vector.
#
# variable: a numeric vector.
# Prints "Mean =  <value>" and returns the mean invisibly, so the result can
# also be stored or reused (e.g. m <- var_mean(x)).
var_mean <- function(variable) {
  # Named `avg` rather than `mean` so the local value does not mask base::mean()
  avg <- sum(variable) / length(variable)
  cat("Mean = ", avg)
  invisible(avg)
}

# Create data
test_data <- c(5,12,98,23,45,7,86,34)

# Now let's try it out
var_mean(test_data)
## Mean =  38.75

Using Functions for Things We Normally Can’t Do in R

depression_data <- read.csv("depression_example_data.csv")
dep_model <- glm (depression ~ intervention + weeks, 
                  data = depression_data,
                  family = binomial(),
                  na.action = na.omit) # casewise deletion

summary(dep_model)
## 
## Call:
## glm(formula = depression ~ intervention + weeks, family = binomial(), 
##     data = depression_data, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6025  -1.0572   0.8107   0.8161   1.3095  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.234660   1.220563  -0.192  0.84754   
## intervention  1.233532   0.414565   2.975  0.00293 **
## weeks        -0.007835   0.175913  -0.045  0.96447   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 154.08  on 112  degrees of freedom
## Residual deviance: 144.16  on 110  degrees of freedom
## AIC: 150.16
## 
## Number of Fisher Scoring iterations: 4
# Print pseudo R^2 statistics for a fitted logistic regression.
#
# LogModel: a fitted model object that provides `deviance`, `null.deviance`
#   and `fitted.values` (e.g. the result of glm(..., family = binomial())).
# Prints the three statistics rounded to 3 decimals and returns the unrounded
# values invisibly as a named numeric vector, so they can be reused.
logistic_r <- function(LogModel) {
  dev <- LogModel$deviance
  null_dev <- LogModel$null.deviance
  # Number of cases the model was fitted to (one fitted value per case)
  n_obs <- length(LogModel$fitted.values)

  r_hl <- 1 - dev / null_dev
  r_cs <- 1 - exp(-(null_dev - dev) / n_obs)
  # Cox and Snell value divided by 1 - exp(-null deviance / n)
  r_n <- r_cs / (1 - exp(-(null_dev / n_obs)))

  cat("Pseudo R^2 for logistic regression\n")
  cat("Hosmer and Lemeshow R^2  ", round(r_hl, 3), "\n")
  cat("Cox and Snell R^2        ", round(r_cs, 3), "\n")
  cat("Nagelkerke R^2           ", round(r_n, 3), "\n")

  invisible(c(hosmer_lemeshow = r_hl, cox_snell = r_cs, nagelkerke = r_n))
}
logistic_r(dep_model)
## Pseudo R^2 for logistic regression
## Hosmer and Lemeshow R^2   0.064 
## Cox and Snell R^2         0.084 
## Nagelkerke R^2            0.113

Reshape Data to Pairwise

Adapted from: https://www.ethan-young.com/posts/restructuring-dyadic-data/

# Import data
df_shape <- read_spss("miis_data_dict_convertedvariables_updated.sav") %>%
    dplyr::rename(dyad_id = Dyad.ID, condition = dyad_cond_ec, role = role_ec,
                  share = tick_share,
                  keep = tick_keep,
                  pshare = tick_part_share,
                  pkeep = tick_part_keep) %>% 
    dplyr::rename_all(tolower)
## re-encoding from UTF-8
df_shape1 <- df_shape %>%
  select(-c(v1,as.numeric, dyad_cond:dyad_cond_w, role_d:neg_role,dic_dist_1:dic_part_2.0,meta_tick_part_keep,id))

colnames(df_shape1)
## [1] "dyad_id"   "condition" "role"      "keep"      "share"     "pshare"   
## [7] "pkeep"

What’s the structure of our dataset?

# Preview df and arrange by Dyad ID
df_shape1 %>%
  arrange(dyad_id)

Role: -1 = partner, 1 = actor

Condition: -1 = control, 1 = experimental

keep: How many raffle tickets would you like to KEEP for yourself?

share: How many raffle tickets would you like to GIVE to your partner?

pshare: How many raffle tickets do you think your PARTNER would GIVE to you?

pkeep: How many raffle tickets do you think your PARTNER would KEEP for themselves?

Individual to Dyad

shape_dyad <- df_shape1 %>%                              
  arrange(dyad_id) %>%  # sort by dyad ID (optional)                           
  gather(key,value, # key = category, value = measurement (names are arbitrary)
         -dyad_id, # don't gather items with (-). These items will repeat.
         -condition,
         -role) %>%    
  mutate(role = ifelse(role == 1,"a","p")) %>% # if role = 1, label with 'a'=actor. 'p'=partner
  unite(new_key,key,role,sep = "_",remove=T) %>% # basically tells it to have role act as key
  spread(new_key,value) # spread key value (role) into new columns

shape_dyad

Individual to Pairwise

pair_shape <- df_shape1 %>% 
  split(.$dyad_id) %>% # create mini dfs by dyad id
  map_df(function(x){ # create a function (x) that applies to each of these dfs
    
    # Separating out actor and target
    actor <- x %>% 
    mutate(act.par = ifelse(role == 1,"s","o")) %>% # act.par = if 1, self = actor
      gather(key,value,
             -dyad_id,
             -condition,
             -act.par) %>% 
      unite(new_key,key,act.par) %>% 
      spread(new_key,value)
    
    partner <- x %>% 
      mutate(act.par = ifelse(role == 1,"o","s")) %>% # act.par = if 1, other = actor
      gather(key,value,
             -dyad_id,
             -condition,
             -act.par) %>% 
      unite(new_key,key,act.par) %>% 
      spread(new_key,value)
    
    bind_rows(actor, partner) # now combine these actor and partner dfs
  }) %>%
  mutate(partnum = ifelse(role_s == 1,1,2)) %>% # partnum: actor = 1, partner = 2
  select(dyad_id,partnum,condition,role_s, ends_with("_s"),ends_with("_o"))


pair_shape %>%
  select(dyad_id, 
         partnum, 
         condition, 
         role_s, 
         role_o, 
         keep_s,
         share_s,
         pkeep_s,
         pshare_s,
         keep_o,
         share_o,
         pkeep_o,
         pshare_o)

Dyad to Pairwise

dyad_pair_shape <- shape_dyad %>%
  gather(key,value,-dyad_id,-condition) %>%         # 
  mutate(role = ifelse(str_detect(key,"_a"),1,-1),  # Going back to individual level
         key    = str_replace(key,"_a|_p","")) %>%  # 
  spread(key,value) %>%                             # 
  split(.$dyad_id) %>%                                  #
  map_df(function(x){                                   #
                                                        #
    actor <- x %>%                                      #
    mutate(act.par = ifelse(role == 1,"s","o")) %>%     # 
      gather(key,value,                                 # individual 
             -dyad_id,                                  # to
             -condition,                                # pairwise
             -act.par) %>%                              #
      unite(new_key,key,act.par) %>%                    #
      spread(new_key,value)                             #
    
    partner <- x %>% 
      mutate(act.par = ifelse(role == 1,"o","s")) %>% 
      gather(key,value,
             -dyad_id,
             -condition,
             -act.par) %>% 
      unite(new_key,key,act.par) %>% 
      spread(new_key,value)
    
    bind_rows(actor, partner) 
  }) %>%
  mutate(partnum = ifelse(role_s == 1,1,2)) %>% 
  select(dyad_id,partnum,condition,role_s, ends_with("_s"),ends_with("_o"))

dyad_pair_shape

Is it identical to our other pairwise df?

setequal(pair_shape,dyad_pair_shape)
## [1] TRUE

Data Visualization

Exam and festival datasets are from: https://studysites.sagepub.com/dsur/study/articles.htm

library(ggplot2)
library(reshape)
library(plyr)

There are three essential grammatical elements: data, aesthetics, and geometries.

1. The data is obviously the data which we want to plot.

2. The aesthetics layer refers to the scales onto which we will map our data.

3. The geom layer refers to the actual shape the data will take in the plot.

mtcars$cyl <- as.factor(mtcars$cyl)
# Data, aesthetics (x, y), point
ggplot(mtcars, aes(cyl, mpg)) +
  geom_point()

# Map disp to both the color aesthetic and the size aesthetic
ggplot(mtcars, aes(wt, mpg, color = disp, size = disp)) +
  geom_point()

Aesthetics using aes()

As a general rule, if you want to set an aesthetic to a specific value, you would specify that outside of aes(). For example, if you specify (color = “blue”), you would not place it in aes(). However, if you want to specify how the aesthetics should be used, you would place it inside aes(). For example, if you want gender to be represented as separate colors, you would use (aes(color = gender)).

Scatterplots

Aesthetic Options: geom_point ()

Shape, color, size, fill, alpha

examData <- read.delim("Exam Anxiety.dat",  header = TRUE)
#Simple scatter
scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point() + labs(x = "Exam Anxiety", y = "Exam Performance %") 

Aesthetic Options: geom_smooth ()

Color, size, fill, linetype, weight, alpha

#Simple scatter with smooth with CI
scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point() + geom_smooth() + labs(x = "Exam Anxiety", y = "Exam Performance %") 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Simple scatter with regression line
scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Exam Anxiety", y = "Exam Performance %") 

#Simple scatter with regression line + CI
scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red") + labs(x = "Exam Anxiety", y = "Exam Performance %") 

#Simple scatter with regression line + coloured CI
scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", alpha = 0.1, fill = "Red") + labs(x = "Exam Anxiety", y = "Exam Performance %") 

#Grouped scatter with regression line + CI
scatter <- ggplot(examData, aes(Anxiety, Exam, colour = Gender))

scatter + geom_point() + geom_smooth(method = "lm", aes(fill = Gender), alpha = 0.1) + labs(x = "Exam Anxiety", y = "Exam Performance %", colour = "Gender")

Change colors

scatter2 <- scatter + geom_point() + geom_smooth(method = "lm", aes(fill = Gender), alpha = 0.1) + labs(x = "Exam Anxiety", y = "Exam Performance %", colour = "Gender")

# Scatter plot
scatter2 + scale_color_manual(values=c("blue", "green"))

# Change line color
# http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
scatter2 + scale_color_manual(values=c("deepskyblue1", "darkseagreen1"))

# Change panel color
scatter2 + theme(panel.background = element_rect(fill = 'white'))

# Change plot color
scatter2 + theme(plot.background = element_rect(fill = 'black', color = 'black'),
                 panel.background = element_rect(fill = 'black'),
                 axis.title = element_text(color = 'white'))

Histograms

Aesthetic Options: geom_histogram ()

Color, size, fill, linetype, weight, alpha

festivalData <- read.delim("DownloadFestival(No Outlier).dat",  header = TRUE)
festivalHistogram <- ggplot(festivalData, aes(day1))

festivalHistogram + geom_histogram(binwidth = 0.4) + labs(x = "Hygiene (Day 1 of Festival)", y = "Frequency")

# install.packages("extrafont")
library(extrafont)

# Change color and font
festivalHistogram + geom_histogram(binwidth=0.2,color="black", fill="lightskyblue2") + theme(text = element_text(size = 12, family = "Comic Sans MS")) + facet_wrap("gender")

Density Plots

festivalDensity <- ggplot(festivalData, aes(day1))
festivalDensity + geom_density() + labs(x = "Hygiene (Day 1 of Festival)", y = "Density Estimate")

# Density by gender
festivalDensity + geom_density() + aes(fill = gender)

# Change opacity and labels
festivalDensity + geom_density(aes(fill = gender), alpha = 0.4) + labs(x = "Hygiene (Day 1 of Festival)", y = "Density Estimate")

Boxplots

Aesthetic Options: geom_boxplot ()

Color, size, fill, weight, alpha

festivalBoxplot2 <- ggplot(festivalData, aes(gender, day1))
festivalBoxplot2 + geom_boxplot() + labs(x = "Gender", y = "Hygiene (Day 1 of Festival)")

festivalBoxplot2 + geom_boxplot(aes(fill = gender)) + geom_point() + labs(x = "Gender", y = "Hygiene (Day 1 of Festival)")

festivalBoxplot2 + geom_boxplot(aes(fill = gender)) + geom_jitter(alpha = 0.2) + labs(x = "Gender", y = "Hygiene (Day 1 of Festival)")

Bar Charts

stat_summary() comes from ggplot2 (its mean_cl_normal and mean_cl_boot helpers rely on the Hmisc package) and can be added as a layer to your graphs.

df_shape1$condition[df_shape1$condition == -1] <- "Control"
df_shape1$condition[df_shape1$condition == 1] <- "Experimental"
df_shape1$role[df_shape1$role == -1] <- "Partner"
df_shape1$role[df_shape1$role == 1] <- "Actor"
bar <- ggplot(df_shape1, aes(condition, share))

bar2 <- bar + 
  stat_summary(aes(condition, share, fill = role ), 
                           fun = mean, # display the means
                           geom = "bar", 
                           position="dodge") + 
  stat_summary(aes(condition, share, fill = role ), 
               fun.data = mean_cl_normal, # 95% CI assuming normality (other option would be _boot)
               geom = "errorbar", 
               position=position_dodge(width=0.90), 
               width = 0.2) + 
  labs(x = "Condition", y = "Money Shared", fill = "Role") + 
  scale_fill_manual(values=c("deepskyblue1", "slategray3"))
## Warning: Ignoring unknown aesthetics: fill
bar2

library(ggsignif)
bar <- ggplot(df_shape1, aes(condition, share))

# Add significance brackets on top of the grouped bar chart
bar2 + 
  scale_y_continuous(breaks = seq(0, 15, 1)) + # specifies breaks (0-15 at every 1pt) 
  coord_cartesian(ylim = c(0, 15)) + # y-axis on 0-15 scale
  # sig bars between roles; the argument is spelled out as `annotations`
  # instead of relying on partial matching of `annotation`
  geom_signif(y_position = c(7.6, 8.5), xmin = c(0.8, 1.8), xmax = c(1.2, 2.2),
              annotations = c("NS", "**"), tip_length = 0, color = "#756F6F") + 
  # sig bars between condition
  geom_signif(comparisons = list(c("Control", "Experimental")), map_signif_level = TRUE,
              annotations = "NS", y_position = 11, color = "#756F6F") + theme_classic()

Line Graphs

# Simulate meaningful fake data
## Depression over time with/without treatment
set.seed(2021) # fix the random draws so the graphs below are reproducible
n_obs <- nrow(depression_data)

# Baseline: every participant starts around 20 (sd = 1). One value is drawn per
# row; the earlier rnorm(20, ...) was silently recycled across all rows.
dep_data <- depression_data
dep_data$Baseline <- rnorm(n_obs, mean = 20, sd = 1)

# Six weeks: each participant is randomly assigned to stay around 20 or to
# drop to around 12 (sd = 2 in both cases)
stays_high <- sample(c(0, 1), size = n_obs, replace = TRUE) == 1
six_weeks <- ifelse(stays_high,
                    yes = rnorm(n_obs, mean = 20, sd = 2),
                    no = rnorm(n_obs, mean = 12, sd = 2))

# Put the intervention group first and pair the rows with the six-week scores
# sorted from lowest to highest, so the intervention group gets the lowest scores
dep_data <- dep_data %>% arrange(desc(intervention))
dep_data$Six_Weeks <- sort(six_weeks)

dep_data$intervention[dep_data$intervention == 0] <- "No Intervention"
dep_data$intervention[dep_data$intervention == 1] <- "Intervention"

# Reshape wide to long
dep_data$ID <- seq_along(dep_data[,1])

dep_data1 <- dep_data %>%
  select(-c(depression, weeks)) %>%
  melt(id = c("ID", "intervention"), measured = c("Baseline", "Six_Weeks")) %>%
  dplyr::rename(Time = variable, Depression_Level = value) %>%
  arrange(ID)
line <- ggplot(dep_data1, aes(Time, Depression_Level, color = intervention))
line + stat_summary(fun = mean, geom = "line", aes(group = intervention))

line + stat_summary(fun = mean, geom = "line", aes(group= intervention)) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) + labs(x = "Time", y = "Depression", colour = "Intervention") + ylim(5, 30)

line + 
  stat_summary(fun = mean, geom = "point", aes(shape = intervention), size = 4) + # Shape of point by group
  stat_summary(fun = mean, geom = "line", aes(group= intervention, linetype = intervention)) + # Dashed or solid line by group
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) + # 95% CI
  labs(x = "Time", y = "Mean Depression Score", colour = "Group", shape= "Group", linetype = "Group") + ylim(5, 30) # Labels and range of y-axis

Exploratory graphs

library(RColorBrewer)
library(gapminder)
library(dplyr)
gm2007.1 <- gapminder %>%
  filter(year == 2007) %>%
  slice_max(lifeExp, n = 10)

gm2007.2 <- gapminder %>%
  filter(year == 2007) %>%
  slice_min(lifeExp, n = 10)

gm2007 <- rbind(gm2007.1, gm2007.2)

# Add a geom_segment() layer
ggplot(gm2007, aes(x = lifeExp, y = country, color = lifeExp)) +
  geom_point(size = 4) +
  geom_segment(aes(xend = 30, yend = country), size = 2)

# Set the color scale
palette <- brewer.pal(5, "RdYlBu")[-(2:4)]

global_mean <- mean(gm2007$lifeExp)
x_start <- global_mean + 3
y_start <- 13
x_end <- global_mean
y_end <- 13.5

# Add a title and caption
plt_country_vs_lifeExp <- ggplot(gm2007, aes(x = lifeExp, y = country, color = lifeExp)) +
  geom_point(size = 4) +
  geom_segment(aes(xend = 30, yend = country), size = 2) +
  geom_text(aes(label = round(lifeExp,1)), color = "white", size = 1.5) +
  scale_x_continuous("", expand = c(0,0), limits = c(30,90), position = "top") +
  scale_color_gradientn(colors = palette) +
  labs(title = "Highest and lowest life expectancies, 2007", caption = "Source: gapminder")

plt_country_vs_lifeExp +
  theme_classic() +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text = element_text(color = "black"),
        axis.title = element_blank(),
        legend.position = "none") +
  geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
  annotate("text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1.1, size = 3, family = "Times", color = "grey40") +
  annotate("curve", x = x_start, y = y_start, xend = x_end, yend = y_end, arrow = arrow(length = unit(0.1, "cm"), type = "closed"), color = "grey40"
  )  +
  theme(text = element_text(family = "Times"))

Animated Graphs

Code from: https://gganimate.com/

# library() stops with an error if gganimate is not installed; require() would
# only return FALSE and let the plot code below fail later
library(gganimate)
# Animated bubble chart: GDP per capita vs life expectancy, one frame per year
ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, colour = country)) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_colour_manual(values = country_colors) +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  facet_wrap(~continent) +
  # Here comes the gganimate specific bits
  labs(title = 'Year: {frame_time}', x = 'GDP per capita', y = 'life expectancy') +
  transition_time(year) +
  ease_aes('linear')